{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Optimize Model Hyperparameters with SigOpt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pygoose import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import datetime\n",
    "import os\n",
    "import pprint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import lightgbm as lgb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import StratifiedKFold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sigopt import Connection"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "project = kg.Project.discover()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Model-specific parameters."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "NUM_FOLDS = 5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "SigOpt-specific parameters.\n",
    "\n",
    "Specify an experiment ID only if you want to continue an existing experiment. Otherwise, a new one will be created."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "SIGOPT_EXPERIMENT_ID = None\n",
    "SIGOPT_EXPERIMENT_NAME = 'Master LightGBM Model'\n",
    "\n",
    "# Prefer the SIGOPT_API_TOKEN environment variable so that a real token never\n",
    "# has to be saved inside the notebook; the placeholder is only a fallback for\n",
    "# the case where the variable is unset or empty.\n",
    "SIGOPT_TOKEN = os.environ.get('SIGOPT_API_TOKEN') or 'YOUR_TOKEN_HERE'\n",
    "\n",
    "# Number of suggest / evaluate / report rounds run against the experiment.\n",
    "NUM_OPTIMIZATION_ITERATIONS = 50"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Make subsequent runs consistent and reproducible."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "RANDOM_SEED = 42\n",
    "np.random.seed(RANDOM_SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "feature_lists = [\n",
    "    'simple_summaries',\n",
    "    'jaccard_ngrams',\n",
    "    'fuzzy',\n",
    "    'tfidf',\n",
    "    'lda',\n",
    "    'nlp_tags',\n",
    "    'wordnet_similarity',\n",
    "    'phrase_embedding',\n",
    "    'wmd',\n",
    "    'wm_intersect',\n",
    "    \n",
    "    '3rdparty_abhishek',\n",
    "    '3rdparty_dasolmar_whq',\n",
    "    '3rdparty_mephistopheies',\n",
    "    '3rdparty_image_similarity',\n",
    "    \n",
    "    'magic_pagerank',\n",
    "    'magic_frequencies',\n",
    "    'magic_cooccurrence_matrix',\n",
    "    \n",
    "    'oofp_nn_mlp_with_magic',\n",
    "    'oofp_nn_cnn_with_magic',\n",
    "    'oofp_nn_bi_lstm_with_magic',\n",
    "    'oofp_nn_siamese_lstm_attention',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df_train, df_test, _ = project.load_feature_lists(feature_lists)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train = df_train.values\n",
    "X_test = df_test.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_train = kg.io.load(project.features_dir + 'y_train.pickle')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Partition the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Seeded, shuffled stratified splitter, so that the folds are the same on every run.\n",
    "kfold = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=RANDOM_SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Set up the experiment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "conn = Connection(client_token=SIGOPT_TOKEN)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "if not SIGOPT_EXPERIMENT_ID:\n",
    "    # No experiment to resume: create a new one with the search space below.\n",
    "    experiment = conn.experiments().create(\n",
    "        name=SIGOPT_EXPERIMENT_NAME,\n",
    "        parameters=[\n",
    "            {'name': 'feature_fraction', 'type': 'double', 'bounds': {'min': 0.1, 'max': 1.0}},\n",
    "            {'name': 'lambda_l2', 'type': 'double', 'bounds': {'min': 0.0, 'max': 50.0}},\n",
    "            {'name': 'num_leaves', 'type': 'int', 'bounds': {'min': 8, 'max': 512}},\n",
    "        ],\n",
    "    )\n",
    "    print(\"Created experiment: https://sigopt.com/experiment/\" + experiment.id)\n",
    "\n",
    "else:\n",
    "    # Continue the experiment whose ID was given in the config section.\n",
    "    experiment = conn.experiments(id=SIGOPT_EXPERIMENT_ID).fetch()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "def evaluate_model(candidate_params):\n",
    "    \"\"\"Cross-validate LightGBM with one set of suggested hyperparameters.\n",
    "\n",
    "    One binary classifier is trained per fold of the notebook-level `kfold`\n",
    "    split of `X_train` / `y_train`, with the held-out part of the fold passed\n",
    "    as a validation set and early stopping enabled.\n",
    "\n",
    "    Args:\n",
    "        candidate_params: mapping with the keys `num_leaves`,\n",
    "            `feature_fraction` and `lambda_l2`.\n",
    "\n",
    "    Returns:\n",
    "        The negated mean of the best validation log loss of every fold,\n",
    "        so that a larger value means a better model.\n",
    "    \"\"\"\n",
    "    cv_scores = []\n",
    "\n",
    "    for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train, y_train)):\n",
    "        # The parameter dict is rebuilt for every fold on purpose: some LightGBM\n",
    "        # releases may remove keys from the dict handed to `lgb.train`.\n",
    "        lgb_params = {\n",
    "            'objective': 'binary',\n",
    "            'metric': 'binary_logloss',\n",
    "            'boosting': 'gbdt',\n",
    "            'device': 'cpu',\n",
    "            'num_leaves': candidate_params['num_leaves'],\n",
    "            'feature_fraction': candidate_params['feature_fraction'],\n",
    "            'lambda_l2': candidate_params['lambda_l2'],\n",
    "            'learning_rate': 0.03,\n",
    "            'num_boost_round': 3000,\n",
    "            'early_stopping_rounds': 5,\n",
    "            'verbose': 1,\n",
    "            'bagging_fraction_seed': RANDOM_SEED,\n",
    "            'feature_fraction_seed': RANDOM_SEED,\n",
    "        }\n",
    "\n",
    "        lgb_data_train = lgb.Dataset(X_train[ix_train], y_train[ix_train])\n",
    "        lgb_data_val = lgb.Dataset(X_train[ix_val], y_train[ix_val])\n",
    "        evals_result = {}\n",
    "\n",
    "        lgb.train(\n",
    "            lgb_params,\n",
    "            lgb_data_train,\n",
    "            valid_sets=[lgb_data_train, lgb_data_val],\n",
    "            evals_result=evals_result,\n",
    "            num_boost_round=lgb_params['num_boost_round'],\n",
    "            early_stopping_rounds=lgb_params['early_stopping_rounds'],\n",
    "            verbose_eval=False,\n",
    "        )\n",
    "\n",
    "        fold_train_scores = evals_result['training'][lgb_params['metric']]\n",
    "        fold_val_scores = evals_result['valid_1'][lgb_params['metric']]\n",
    "\n",
    "        # Early stopping only fires after several rounds without improvement,\n",
    "        # so the last recorded value is not the best one. Score the fold at the\n",
    "        # round with the lowest validation loss instead.\n",
    "        best_round = int(np.argmin(fold_val_scores))\n",
    "\n",
    "        print('Fold {}: {} rounds, training loss {:.6f}, validation loss {:.6f}'.format(\n",
    "            fold_num + 1,\n",
    "            best_round + 1,\n",
    "            fold_train_scores[best_round],\n",
    "            fold_val_scores[best_round],\n",
    "        ))\n",
    "\n",
    "        cv_scores.append(fold_val_scores[best_round])\n",
    "\n",
    "    # Log loss is lower-is-better; the sign is flipped so that larger is better.\n",
    "    return -np.mean(cv_scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Each round: request a candidate, cross-validate it, report the score back.\n",
    "for i in range(NUM_OPTIMIZATION_ITERATIONS):\n",
    "    print(f'Iteration {i + 1} of {NUM_OPTIMIZATION_ITERATIONS}')\n",
    "\n",
    "    suggestion = conn.experiments(experiment.id).suggestions().create()\n",
    "    print('Suggestion: ')\n",
    "    pprint.pprint(suggestion.assignments)\n",
    "\n",
    "    score = evaluate_model(suggestion.assignments)\n",
    "    print(f'Score: {score:.6f}')\n",
    "    print()\n",
    "\n",
    "    # Attach the observed score to the suggestion it was computed for.\n",
    "    observation = dict(suggestion=suggestion.id, value=score)\n",
    "    conn.experiments(experiment.id).observations().create(**observation)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
